#importing the libraries for numpy, pandas, seaborn, warnings
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import pandas_profiling
import statsmodels.api as sm
import statsmodels.formula.api as smf
#importing the libraries for svm, SVC, GridSearchCV, pyplot, metrics
from sklearn import svm
from sklearn.svm import SVC
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, average_precision_score,recall_score,precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error,classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import preprocessing
from IPython.display import Image
from IPython.core.display import HTML
# --- Global plotting / notebook configuration ---
plt.rc("font", size=14)  # default font size for all matplotlib text
sns.set(style="white") #white background style for seaborn plots
# NOTE: this second call overrides the previous one (whitegrid wins)
sns.set(style="whitegrid", color_codes=True)
#print multiple statements in same line
from IPython.core.interactiveshell import InteractiveShell
# Render every bare expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity ="all"
# Silence ALL warnings (this also hides deprecation notices)
warnings.filterwarnings('ignore')
%matplotlib inline
Attribute information:
For more information, read [Cortez et al., 2009].
Input variables (based on physicochemical tests):
1 - fixed acidity (tartaric acid - g / dm^3)
2 - volatile acidity (acetic acid - g / dm^3)
3 - citric acid (g / dm^3)
4 - residual sugar (g / dm^3)
5 - chlorides (sodium chloride - g / dm^3)
6 - free sulfur dioxide (mg / dm^3)
7 - total sulfur dioxide (mg / dm^3)
8 - density (g / cm^3)
9 - pH
10 - sulphates (potassium sulphate - g / dm3)
11 - alcohol (% by volume)
Output variable (based on sensory data):
12 - quality (score between 0 and 10)
Missing Attribute Values: None
Description of attributes:
1 - fixed acidity: most acids involved with wine or fixed or nonvolatile (do not evaporate readily)
2 - volatile acidity: the amount of acetic acid in wine, which at too high of levels can lead to an unpleasant, vinegar taste
3 - citric acid: found in small quantities, citric acid can add 'freshness' and flavor to wines
4 - residual sugar: the amount of sugar remaining after fermentation stops, it's rare to find wines with less than 1 gram/liter and wines with greater than 45 grams/liter are considered sweet
5 - chlorides: the amount of salt in the wine
6 - free sulfur dioxide: the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion; it prevents microbial growth and the oxidation of wine
7 - total sulfur dioxide: amount of free and bound forms of S02; in low concentrations, SO2 is mostly undetectable in wine, but at free SO2 concentrations over 50 ppm, SO2 becomes evident in the nose and taste of wine
8 - density: the density of wine is close to that of water depending on the percent alcohol and sugar content
9 - pH: describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic); most wines are between 3-4 on the pH scale
10 - sulphates: a wine additive which can contribute to sulfur dioxide gas (SO2) levels, which acts as an antimicrobial and antioxidant
11 - alcohol: the percent alcohol content of the wine
Output variable (based on sensory data): 12 - quality (score between 0 and 10)
#import CSV file
# The UCI wine-quality CSVs are semicolon-delimited, hence delimiter=';'.
df_white=pd.read_csv('winequality-white.csv', delimiter=';')
df_white.head()  # preview the first rows
df_white.shape  # (n_rows, n_columns)
df_white.info()  # column dtypes and non-null counts
# Full exploratory HTML report rendered inline in the notebook
pandas_profiling.ProfileReport(df_white)
# Partition the wines into quality bands (above 6 / below 5 / 5-6) and
# report the size of each band plus the share of high-quality wines.
n_wines = len(df_white)
# Number of wines with quality rating above 6
quality_above_6 = df_white[df_white['quality'] > 6]
n_above_6 = len(quality_above_6)
# Number of wines with quality rating below 5
quality_below_5 = df_white[df_white['quality'] < 5]
n_below_5 = len(quality_below_5)
# Number of wines with quality rating between 5 to 6 (inclusive on both ends)
quality_between_5 = df_white[df_white['quality'].between(5, 6)]
n_between_5 = len(quality_between_5)
# Percentage of wines with quality rating above 6
greater_percent = 100 * n_above_6 / n_wines
# Print the results
print("Total number of white wine data: {}".format(n_wines))
print("White Wines with rating 7 and above: {}".format(n_above_6))
print("White Wines with rating less than 5: {}".format(n_below_5))
print("White Wines with rating 5 and 6: {}".format(n_between_5))
print("Percentage of white wines with quality 7 and above: {:.2f}%".format(greater_percent))
# Some more additional data analysis
# Summary statistics rounded to integers; display() is the notebook renderer
display(np.round(df_white.describe()))
print("white wine mean = ",df_white["quality"].mean())
# Pairwise scatter matrix of all attributes, KDE plots on the diagonal
pd.plotting.scatter_matrix(df_white, alpha = 0.3, figsize = (40,40), diagonal = 'kde');
# Correlation structure of the attributes: the full matrix as a table,
# then a rounded, annotated heatmap.
# (NOTE: the title says "Reds" but the colormap actually used is "Purples".)
df_w_corr = df_white.corr()
df_w_corr
plt.subplots(figsize=(20, 15))
ax = plt.axes()
ax.set_title("Wine Characteristic Correlation Heatmap (Reds)")
corr = df_white.corr().round(2)
labels = corr.columns.values
sns.heatmap(corr, annot=True, cmap="Purples",
            xticklabels=labels, yticklabels=labels)
Fixed Acidity
Titratable acidity, sometimes referred to as fixed acidity, is a measurement of the total concentration of titratable acids and free hydrogen ions present in your wine. A litmus paper can be used to identify whether a given solution is acidic or basic. The most common titratable acids are tartaric, malic, citric and carbonic acid. These acids, along with many more in smaller quantities, either occur naturally in the grapes or are created through the fermentation process.
pH
pH stands for power of hydrogen, which is a measurement of the hydrogen ion concentration in the solution. Generally, solutions with a pH value less than 7 are considered acidic, with some of the strongest acids being close to 0. Solutions above 7 are considered alkaline or basic. The pH value of water is 7, as it is neither an acid nor a base.
#Visualize the co-relation between pH and fixed Acidity
# FIX: the raw UCI winequality-white.csv header names its columns with
# spaces ("fixed acidity"), and no rename is performed after read_csv, so
# the underscored name 'fixed_acidity' raised a KeyError.
#Create a new dataframe containing only pH and fixed acidity columns to visualize their co-relations
fixedAcidity_pH = df_white[['pH', 'fixed acidity']]
#Initialize a joint-grid with the dataframe, using seaborn library
# (size= is the pre-0.9 seaborn name for height=)
gridA = sns.JointGrid(x="fixed acidity", y="pH", data=fixedAcidity_pH, size=6)
#Draws a regression plot in the grid
gridA = gridA.plot_joint(sns.regplot, scatter_kws={"s": 10})
#Draws a distribution plot in the same grid
gridA = gridA.plot_marginals(sns.distplot)
The pH values fall roughly between 2.8 and 4.0, well below the neutral value of 7, so the white wine tastes sour (acidic), as is characteristic of white wine.
Fixed Acidity
Titratable acidity, sometimes referred to as fixed acidity, is a measurement of the total concentration of titratable acids and free hydrogen ions present in your wine. A litmus paper can be used to identify whether a given solution is acidic or basic. The most common titratable acids are tartaric, malic, citric and carbonic acid. These acids, along with many more in smaller quantities, either occur naturally in the grapes or are created through the fermentation process.
Citric Acid
Citric acid is generally found in very small quantities in wine grapes. It acts as a preservative and is added to wines to increase acidity, complement a specific flavor or prevent ferric hazes. It can be added to finished wines to increase acidity and give a “fresh” flavor. Excess addition, however, can ruin the taste.
# Joint regression plot of citric acid vs fixed acidity.
# FIX: the UCI csv header uses spaces ("citric acid", "fixed acidity");
# the underscored names raised a KeyError since nothing renames the columns.
fixedAcidity_citricAcid = df_white[['citric acid', 'fixed acidity']]
g = sns.JointGrid(x="fixed acidity", y="citric acid", data=fixedAcidity_citricAcid, size=6)
g = g.plot_joint(sns.regplot, scatter_kws={"s": 10})
g = g.plot_marginals(sns.distplot)
Citric acid ranges between 0 and about 1.25, which appears to be a typical level, and fixed acidity either occurs naturally in the grapes or is created through the fermentation process.
Volatile Acidity
Volatile acidity (VA) is a measure of the wine's volatile (or gaseous) acids. The primary volatile acid in wine is acetic acid, which is also the primary acid associated with the smell and taste of vinegar.
Volatile acidity concentration is regulated by the federal Tax and Trade Bureau, and allowable levels for various wine styles can be found in the Code of Federal Regulations (CFR). In general, per the CFR: "The maximum volatile acidity, calculated as acetic acid and exclusive of sulfur dioxide, is 0.14 g/100 mL for red wine and 0.12 g/100 mL for white wines."
# Mean volatile acidity per quality level, as a bar plot.
# FIX: the UCI csv header uses a space ("volatile acidity"); the underscored
# name raised a KeyError since the columns are never renamed after read_csv.
volatileAcidity_quality = df_white[['volatile acidity', 'quality']]
fig, axs = plt.subplots(ncols=1,figsize=(10,6))
sns.barplot(x='quality', y='volatile acidity', data=volatileAcidity_quality, ax=axs)
plt.title('quality VS volatile acidity')
plt.tight_layout()
plt.show()
plt.gcf().clear()
Alcohol vs. Quality
# Cast quality to a categorical dtype so seaborn treats it as discrete levels.
# NOTE(review): this mutates df_white['quality'] for every later cell.
df_white["quality"] = pd.Categorical(df_white["quality"])
# Count of wines per quality level
sns.countplot(x="quality", data=df_white)
plt.xlabel("Quality level of wine (0-10 scale)")
plt.show()
# Strip plot of alcohol content within each quality level
# (factorplot is the pre-0.9 seaborn name for catplot)
sns.factorplot(x="quality", y="alcohol", data=df_white, kind="strip")
plt.xlabel("Quality level of wine, 0-10 scale")
plt.ylabel("Alcohol level in wine, % ABV")
plt.title("Alcohol percent in each level of white wine's quality")
plt.show()
# Mean alcohol content per quality level, shown as a bar plot.
quality_alcohol = df_white[['quality', 'alcohol']]
fig, axs = plt.subplots(ncols=1, figsize=(10, 6))
sns.barplot(
    x='quality',
    y='alcohol',
    data=quality_alcohol,
    ax=axs,
)
plt.title('quality VS alcohol')
plt.tight_layout()
plt.show()
plt.gcf().clear()
#Making binary classificaion for the response variable.
#Dividing wine as good and bad by giving the limit for the quality
# Bin edges: (2, 6.5] -> 'bad', (6.5, 8] -> 'good'.
# NOTE(review): white-wine quality reaches 9, which falls OUTSIDE the last
# bin and becomes NaN; .codes below then maps it to -1, silently creating a
# third class — confirm this is intended (later LDA code yields two
# discriminants only because of this).  Also, quality was cast to a
# Categorical in an earlier cell; pd.cut expects numeric input — verify it
# still coerces correctly on this pandas version.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
df_white['quality'] = pd.cut(df_white['quality'], bins = bins, labels = group_names)
df_white['quality'].value_counts()
sns.countplot(df_white['quality'])
# Categorical into 1 quality:
# Integer-encode the labels: bad -> 0, good -> 1 (NaN -> -1)
df_white['quality'] = pd.Categorical(df_white['quality']).codes
# Separate the predictors (x) from the target label (y), then hold out
# 30 % of the rows as a test set; random_state fixed for reproducibility.
y = df_white['quality']
x = df_white.drop(columns='quality')
df_white.head(10)
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42)
# Sanity-check the shapes and type of the resulting splits
X_train.shape
X_test.shape
y_train.shape
y_test.shape
type(X_train)
#Fit LDA on training data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
sklearn_lda = LinearDiscriminantAnalysis()
# Project the training data onto the discriminant axes (n_classes - 1 of them)
transf_lda = sklearn_lda.fit_transform(X_train, y_train)
# Explained Variance
str(sklearn_lda.explained_variance_ratio_)
# Use LDA to make prediction on test data
print('Confusion Matrix on the LDA-Classifier')
print(metrics.confusion_matrix(y_test, sklearn_lda.predict(X_test)))
# Compute Accuracy of classification
pred_test_lda = sklearn_lda.predict(X_test)
metrics.accuracy_score(y_test,pred_test_lda)
# Refit on the FULL dataset to inspect the discriminant coefficients
lda = LinearDiscriminantAnalysis().fit(x,y)
# NOTE(review): two LD columns require THREE classes in y; this only works
# because the earlier binning left a -1 code for quality-9 wines — confirm
# that third class is intended.
df=pd.DataFrame(lda.scalings_, index=x.columns, columns=('LD1','LD2'))
df
# Explained Variance
lda.explained_variance_ratio_
# Use LDs as input in Logistic Regression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from mlxtend.plotting import plot_decision_regions
lda = LDA()
# Project ALL data (not just the training split) onto the discriminant axes
X_train_lda = lda.fit_transform(x,y)
# plot_decision_regions requires a plain ndarray, not a pandas Series
y= np.array(y)
logregr=LogisticRegression()
logregr.fit(X_train_lda,y)
plt.figure(figsize=(15,15))
# Decision boundary of the logistic model in the 2-D LD space
plot_decision_regions(X_train_lda, y, clf = logregr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.legend(loc='lower left')
plt.show()
# Baseline SVM classifier with default hyperparameters (RBF kernel, C=1)
svm_clf = svm.SVC()
svm_clf.fit(X_train, y_train)
The following example demonstrates how to estimate the accuracy of a support vector machine on the wine quality dataset by splitting the data, fitting a model and computing the score 5 consecutive times (with different splits each time):
from sklearn.model_selection import ShuffleSplit
n_samples = x.shape[0]
# 5 random 70/30 shuffles of the data for cross-validation
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
# Scores of the unscaled baseline SVM on each shuffle
cross_val_score(svm_clf, x,y, cv=cv)
from sklearn.pipeline import make_pipeline
# Pipeline: standardise the features, then fit an SVC with C=1
clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
scores = cross_val_score(clf, x,y, cv=cv)
scores
print('The mean score and the 95% confidence interval of the score estimate are hence given by:')
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))
# Cross-validate the baseline SVM with several scoring metrics at once.
from sklearn.model_selection import cross_validate
# FIX: make_scorer is public in sklearn.metrics; the private
# sklearn.metrics.scorer module used before was removed in scikit-learn 0.24.
from sklearn.metrics import make_scorer
# NOTE(review): the 'rec_micro' key actually computes MACRO-averaged recall
# (average='macro'); the key is kept unchanged for compatibility.
scoring = {'prec_macro': 'precision_macro',
           'rec_micro': make_scorer(recall_score, average='macro')}
scores = cross_validate(svm_clf, x, y, scoring=scoring, cv=5,
                        return_train_score=True)
sorted(scores.keys())
scores
# Standardise features using statistics learned on the training split only,
# then evaluate a C=1 SVC on the transformed test data.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_transformed = scaler.transform(X_train)
clf = svm.SVC(C=1).fit(X_train_transformed, y_train)
X_test_transformed = scaler.transform(X_test)
clf.score(X_test_transformed, y_test)
# In-/out-of-sample MSE of the UNSCALED baseline SVM on the integer codes
ytrain_pred = svm_clf.predict(X_train)
print("In-sample Mean squared error: %.2f"
% mean_squared_error(y_train, ytrain_pred))
ypred1 = svm_clf.predict(X_test)
print("Out-of-sample Mean squared error: %.2f"
% mean_squared_error(y_test, ypred1))
# Evaluate the baseline SVM on the test split.
# FIX: predict once and reuse the result — the original called
# svm_clf.predict(X_test) three separate times for the same answer.
y_pred_svm = svm_clf.predict(X_test)
cf = confusion_matrix(y_test, y_pred_svm)
svm_clfPerformance = precision_recall_fscore_support(y_test, y_pred_svm)
print ('Cross-validation of :Support Vector Machine')
print('Precision: = {0}'.format(svm_clfPerformance[0]))
print('Recall: = {0}'.format(svm_clfPerformance[1]))
print('Fscore: = {0}'.format(svm_clfPerformance[2]))
print('Support: = {0}'.format(svm_clfPerformance[3]))
print('Confusion Matrix:\n',cf)
print ('****')
#Confusion Matrix
print("Confusion Matrix")
# Rows of cf are true labels, columns are predictions.
# NOTE(review): if y still contains the -1 code from the binning step, cf is
# 3x3 and these two-entry tick labels will not match — confirm class count.
lbl1 = ["Predicted 0","Predicted 1"]
lbl2 = ["True 0", "True 1"]
sns.heatmap(cf, annot=True, cmap="Greens", fmt="d", xticklabels=lbl1, yticklabels=lbl2)
plt.show()
#Finding best parameters for our SVC model
svc = SVC()
svc.get_params()
# Hyperparameter search space.
# (A dead, byte-identical duplicate assignment of `param` that preceded
# this dict in the original has been removed.)
param = {
    'C': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.1, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4],
}
# Randomized search: samples 10 combinations (default n_iter) out of the
# 8*2*8 grid, each evaluated with 10-fold cross-validation.
grid_svc = RandomizedSearchCV(svc, param_distributions=param, scoring='accuracy', cv=10)
grid_svc.fit(X_train, y_train)
#Best parameters for our svc model
grid_svc.best_params_
#Best score for our svc model
grid_svc.best_score_
# Re-evaluate the chosen parameters with GridSearchCV.
# (A dead, byte-identical duplicate assignment of `paramgrid` that preceded
# this dict in the original has been removed.)
paramgrid = {
    'C': [1.2],
    'kernel': ['rbf'],
    'gamma': [0.9],
}
grid_svc1 = GridSearchCV(svc, paramgrid, scoring='accuracy', cv=5)
# Cross-validated score on the FULL dataset ...
grid_svc1.fit(x, y)
#Best score for our svc model
grid_svc1.best_score_
# ... and on the training split only
grid_svc1.fit(X_train, y_train)
#Best score for our svc model
grid_svc1.best_score_
#Let's run our SVC again with the best parameters.
svc2 = SVC(C = 1.2, gamma = 0.9, kernel= 'rbf')
svc2.fit(X_train, y_train)
pred_svc2 = svc2.predict(X_test)
# Per-class precision/recall/F1 on the held-out test set
print(classification_report(y_test, pred_svc2))